Fit Model ARMA(p, q) to residuals
Diagnose model(last residuals should be White Noise)
Simulate prediction
DT::datatable(potential_data)
summary(potential_data)
## Total A1 A2
## Min. : 3086 Min. : 81.0 Min. : 2888
## 1st Qu.:11692 1st Qu.:146.2 1st Qu.:11495
## Median :15630 Median :173.5 Median :15462
## Mean :16748 Mean :185.1 Mean :16563
## 3rd Qu.:23508 3rd Qu.:223.0 3rd Qu.:23365
## Max. :29869 Max. :339.0 Max. :29696
potential_data_ts = ts(potential_data,
start = c(2000,1),
freq = 12)
plot(potential_data_ts, main = 'ts-plot')
boxplot(potential_data_ts)
Due to extremely low numbers of A1 class and the similarity between Total and A2 class, let’s focus on A2 class as our research target.
target_data_ts = ts(target_data,start = c(2000,1),freq = 12)
boxplot(target_data_ts ~cycle(target_data_ts ),
main = target_column ,
xlab = 'month',
ylab = target_column )
len = length(target_data)
data = target_data[1:(len-forecast_num)]
data_ts = ts(data,start = c(2000,1),freq = 12)
test = target_data[(len-forecast_num+1):len]
test_ts = ts(test,end = c(2019,2),freq = 12)
model_lm = lm(data_ts~time(data_ts))
summary(model_lm)
##
## Call:
## lm(formula = data_ts ~ time(data_ts))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4578.8 -989.7 -69.1 971.6 5865.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.626e+06 4.229e+04 -62.10 <2e-16 ***
## time(data_ts) 1.315e+03 2.105e+01 62.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1608 on 214 degrees of freedom
## Multiple R-squared: 0.948, Adjusted R-squared: 0.9478
## F-statistic: 3904 on 1 and 214 DF, p-value: < 2.2e-16
plot(data_ts, main = 'Linear model with intercept');abline(reg = model_lm,col = 'red')
## original : for follow-up analysis
res_lm = residuals(model_lm)
plot(y=res_lm ,x=as.vector(time(data_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals : LM de-trend')
## standard : observe outliers (some issues sor events)
std_res_lm = ts(rstudent(model_lm),
start = c(2000,1),freq = 12)
plot(y=std_res_lm ,x=as.vector(time(std_res_lm)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard residuals : LM de-trend');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)
hist(std_res_lm ,
breaks = 30,
xlab='Standardized Residuals',
main = 'Residuals Histogram')
qqnorm(std_res_lm , main = 'QQ plot');qqline(std_res_lm , col = 'red')
ks.test(std_res_lm ,"pnorm")
##
## One-sample Kolmogorov-Smirnov test
##
## data: std_res_lm
## D = 0.034903, p-value = 0.955
## alternative hypothesis: two-sided
acf(res_lm, main = 'ACF of Residuals', lag.max = 60)
After de-trend process, residuals are considered to be normal distribution , but not independent random variable. Therefore, it’s clearly proper to treat the data as a time series.
model_lm_woi = lm(data_ts~time(data_ts)-1)
summary(model_lm_woi)
##
## Call:
## lm(formula = data_ts ~ time(data_ts) - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13052 -4927 -1406 6524 13637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## time(data_ts) 7.9699 0.2369 33.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6995 on 215 degrees of freedom
## Multiple R-squared: 0.8403, Adjusted R-squared: 0.8396
## F-statistic: 1132 on 1 and 215 DF, p-value: < 2.2e-16
plot(data_ts, main='Linear Model w.o. intercept');abline(reg = model_lm_woi, col = 'red')
plot(data_ts, main='Linear Model w.o. intercept (start at (0,0))',
xlim = c(0,2018), ylim = c(0,26000));abline(reg = model_lm_woi, col = 'red')
According to the plot above, the linear model without intercept may not proper to fit this data due to the restriction of passing through origin coordinates.
Attention : We continue the seasonal analysis with residual data from de-trend process instead of original data.
season_data = res_lm
season_data_ts = ts(season_data,start = c(2000,1),freq = 12)
plot(y=season_data_ts ,x=as.vector(time(season_data_ts )),
xlab='Time',ylab='Residuals',type='o',
main = paste0('Residuals of ',target_column,' after de-trend'))
boxplot(season_data_ts~cycle(season_data_ts),
main = paste0('Residual of ',
target_column ,
' after de-trend'),
xlab = 'month',
ylab = 'Residual')
It’s obviously that the occurrence of motor vehicle accident in February is lower than other months. Since means of twelves months are not equal,the seasonal effect should’nt be ignored.
mon=season(season_data_ts)
model_season = lm(season_data_ts~mon-1)
summary(model_season)
##
## Call:
## lm(formula = season_data_ts ~ mon - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4548.1 -976.3 73.0 927.8 4433.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## monJanuary 1086.50 331.48 3.278 0.00123 **
## monFebruary -1879.83 331.48 -5.671 4.81e-08 ***
## monMarch -78.44 331.48 -0.237 0.81318
## monApril -691.49 331.48 -2.086 0.03821 *
## monMay 206.12 331.48 0.622 0.53476
## monJune -46.82 331.48 -0.141 0.88780
## monJuly 232.18 331.48 0.700 0.48446
## monAugust -520.93 331.48 -1.572 0.11761
## monSeptember -576.88 331.48 -1.740 0.08331 .
## monOctober 498.68 331.48 1.504 0.13402
## monNovember 339.51 331.48 1.024 0.30693
## monDecember 1431.41 331.48 4.318 2.45e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1406 on 204 degrees of freedom
## Multiple R-squared: 0.2706, Adjusted R-squared: 0.2277
## F-statistic: 6.306 on 12 and 204 DF, p-value: 1.87e-09
model_season_fit = ts(fitted(model_season),start = c(2000,1),freq = 12)
plot(model_season_fit ,
main = 'Seasonal model w.o. intercept',
col = 'red',
ylim = c(min(season_data_ts),max(season_data_ts)));points(season_data_ts, col = 'black')
## residual
res_season = residuals(model_season)
res_season_ts = ts(res_season,
start = c(2000,1),
freq = 12)
plot(y=res_season_ts ,x=as.vector(time(res_season_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals : de-trend & de-season')
### standard residual
std_res_season = rstudent(model_season)
plot(y=std_res_season ,x=as.vector(time(std_res_season)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard Residuals : de-trend & de-season');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)
hist(std_res_season ,
breaks = 30,
xlab='Standardized Residuals',
main = 'Residuals Histogram')
qqnorm(std_res_season , main = 'QQ plot');qqline(std_res_lm , col = 'red')
ks.test(std_res_season ,"pnorm")
##
## One-sample Kolmogorov-Smirnov test
##
## data: std_res_season
## D = 0.037843, p-value = 0.9165
## alternative hypothesis: two-sided
According to the report, Jan, Feb, Apr and Dec are significant months in seasonal model; On the other side ,the plot of fitted values showes that the seasonal model still doesn’t fully explain the data.
After de-trend and de-season process, now lets check the acf and pacf and try to fit ARMA model; By the way, interestingly, ks-test still doesn’t reject the hypothesis of normal-distribution assumption.
month_=season(season_data_ts)
model_season_wi=lm(season_data_ts~month_)
summary(model_season_wi)
##
## Call:
## lm(formula = season_data_ts ~ month_)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4548.1 -976.3 73.0 927.8 4433.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1086.5 331.5 3.278 0.001230 **
## month_February -2966.3 468.8 -6.328 1.55e-09 ***
## month_March -1164.9 468.8 -2.485 0.013758 *
## month_April -1778.0 468.8 -3.793 0.000196 ***
## month_May -880.4 468.8 -1.878 0.061804 .
## month_June -1133.3 468.8 -2.418 0.016502 *
## month_July -854.3 468.8 -1.822 0.069853 .
## month_August -1607.4 468.8 -3.429 0.000733 ***
## month_September -1663.4 468.8 -3.548 0.000481 ***
## month_October -587.8 468.8 -1.254 0.211301
## month_November -747.0 468.8 -1.593 0.112604
## month_December 344.9 468.8 0.736 0.462732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1406 on 204 degrees of freedom
## Multiple R-squared: 0.2706, Adjusted R-squared: 0.2312
## F-statistic: 6.879 on 11 and 204 DF, p-value: 7.843e-10
model_season_wi_fit = ts(fitted(model_season_wi),start = c(2000,1),freq = 12)
ts.plot(model_season_fit,model_season_wi_fit,col = c(2,4),lwd = c(2,2),lty = c(2,3), main = 'Seasonal Means Model',
ylim = c(min(season_data_ts),max(season_data_ts)));points(season_data_ts, col = 'black')
Otherwise, it excludes the January effect in seasonal model with intercept, but fitted values are totally the same with ones without intercept; for the sake, the following analysis output will not differ with each seasonal model result.
har =harmonic(season_data_ts, 6)
model_har = lm(season_data_ts~har)
summary(model_har)
##
## Call:
## lm(formula = season_data_ts ~ har)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4548.1 -976.3 73.0 927.8 4433.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.199e-10 9.569e+01 0.000 1.000000
## harcos(2*pi*t) 2.123e+02 1.353e+02 1.569 0.118303
## harcos(4*pi*t) 1.764e+02 1.353e+02 1.303 0.193932
## harcos(6*pi*t) 3.708e+01 1.353e+02 0.274 0.784346
## harcos(8*pi*t) 2.815e+02 1.353e+02 2.080 0.038782 *
## harcos(10*pi*t) 1.778e+02 1.353e+02 1.314 0.190326
## harcos(12*pi*t) 2.015e+02 9.569e+01 2.106 0.036449 *
## harsin(2*pi*t) -3.821e+02 1.353e+02 -2.824 0.005220 **
## harsin(4*pi*t) -7.197e+02 1.353e+02 -5.318 2.74e-07 ***
## harsin(6*pi*t) -2.745e+02 1.353e+02 -2.028 0.043821 *
## harsin(8*pi*t) -3.730e+02 1.353e+02 -2.757 0.006372 **
## harsin(10*pi*t) -4.875e+02 1.353e+02 -3.602 0.000396 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1406 on 204 degrees of freedom
## Multiple R-squared: 0.2706, Adjusted R-squared: 0.2312
## F-statistic: 6.879 on 11 and 204 DF, p-value: 7.843e-10
model_har_fit = ts(fitted(model_har),
start = c(2000,1),freq = 12)
ts.plot(season_data_ts, lty = c(1,3), col=c(1,4),
main = 'Harmonic(6)',
ylim = c(min(model_har_fit,season_data_ts),
max(model_har_fit,season_data_ts)));points(season_data_ts);lines(model_har_fit,col="red")
arma_data = res_season
arma_data_ts = ts(arma_data,start = c(2000,1),freq = 12)
plot(y=arma_data_ts ,x=as.vector(time(arma_data_ts)),
xlab='Time',ylab='Residuals',type='o',
main = paste0('Residuals of ',
target_column ,
' after de-trend and de-season'))
acf(arma_data, lag = 60, main = 'ACF of Residuals after de-trend and de season')
pacf(arma_data, lag = 60, main = 'PACF of Residuals after de-trend and de season')
The tail-off ACF plot with cuts-off PACF plot implies the AR(p) model. Because of cuts-off after lag 1 in PACF plot, we first give AR(1) or SARMA(1,0)(1,0)[6] a try.
model_ar1 = Arima(arma_data_ts, order = c(1,0,0))
summary(model_ar1)
## Series: arma_data_ts
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.8025 -96.8391
## s.e. 0.0420 284.3951
##
## sigma^2 estimated as 707878: log likelihood=-1760.77
## AIC=3527.53 AICc=3527.64 BIC=3537.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 8.801504 837.4505 649.3579 40.606 230.7969 0.6335582
## ACF1
## Training set -0.2102008
model_ar1_fit = fitted(model_ar1)
ts.plot(arma_data_ts, model_ar1_fit,col = c(1,2),lwd = c(2,1),lty = c(2,1), main = 'AR(1) Model',
ylim = c(min(arma_data_ts),max(arma_data_ts)));points(arma_data_ts, col = 'black')
## residual
res_ar1 = residuals(model_ar1) ## model_ar1$residuals
res_ar1_ts = ts(res_ar1, start = c(2000,1), freq = 12)
plot(y=res_ar1_ts ,x=as.vector(time(res_ar1_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals of AR(1) after de-trend & de-season')
## standard residual
## rstudent not work to Arima object, but I found rstandard's output not equal to rstudent with seasonal model
std_res_ar1 = rstandard(model_ar1)
plot(y=std_res_ar1 ,x=as.vector(time(std_res_ar1)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard Residuals of AR(1) after de-trend & de-season');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)
hist(std_res_ar1 ,
breaks = 30,
xlab='Standardized Residuals',
main = 'Residuals Histogram')
qqnorm(std_res_ar1 , main = 'QQ plot');qqline(std_res_ar1 , col = 'red')
ks.test(std_res_ar1 ,"pnorm")
##
## One-sample Kolmogorov-Smirnov test
##
## data: std_res_ar1
## D = 0.043488, p-value = 0.8086
## alternative hypothesis: two-sided
acf(as.vector(res_ar1), lag = 60, main = 'ACF : Residuals of AR(1)')
pacf(as.vector(res_ar1), lag = 60, main = 'PACF : Residuals of AR(1)')
model_ar1010 = Arima(arma_data_ts,
order = c(1,0,0),
seasonal = list(order = c(1,0,0),
period = 6))
summary(model_ar1010)
## Series: arma_data_ts
## ARIMA(1,0,0)(1,0,0)[6] with non-zero mean
##
## Coefficients:
## ar1 sar1 mean
## 0.7119 0.3859 -132.1882
## s.e. 0.0540 0.0729 294.3333
##
## sigma^2 estimated as 625227: log likelihood=-1747.22
## AIC=3502.45 AICc=3502.63 BIC=3515.95
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 9.343143 785.2024 606.2786 52.1924 211.0902 0.5915271
## ACF1
## Training set -0.203691
model_ar1010_fit = fitted(model_ar1010)
ts.plot(arma_data_ts, model_ar1010_fit,col = c(1,2),lwd = c(2,1),lty = c(2,1), main = 'SARIMA(1,0,0)(1,0,0)[6] Model',
ylim = c(min(arma_data_ts),max(arma_data_ts)));points(arma_data_ts, col = 'black')
## residual
res_ar1010 = residuals(model_ar1010) ## model_ar1$residuals
res_ar1010_ts = ts(res_ar1010, start = c(2000,1), freq = 12)
plot(y=res_ar1010_ts ,x=as.vector(time(res_ar1010_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals of SARIMA(1,0,0)(1,0,0)[6]')
## standard residual
## rstudent not work to Arima object, but I found rstandard's output not equal to rstudent with seasonal model
std_res_ar1010 = rstandard(model_ar1010)
plot(y=std_res_ar1010 ,x=as.vector(time(std_res_ar1010)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard Residuals of SARIMA(1,0,0)(1,0,0)[6]');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)
hist(std_res_ar1010 ,
breaks = 30,
xlab='Standardized Residuals',
main = 'Residuals Histogram')
qqnorm(std_res_ar1010 , main = 'QQ plot');qqline(std_res_ar1010 , col = 'red')
ks.test(std_res_ar1010 ,"pnorm")
##
## One-sample Kolmogorov-Smirnov test
##
## data: std_res_ar1010
## D = 0.052486, p-value = 0.5913
## alternative hypothesis: two-sided
acf(as.vector(res_ar1010), lag = 60, main = 'ACF : Residuals of SARIMA(1,0,0)(1,0,0)[6]')
pacf(as.vector(res_ar1010), lag = 60, main = 'PACF : Residuals of SARIMA(1,0,0)(1,0,0)[6]')
eacf(arma_data)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x x x x
## 1 x o o x o x o o o x o x o o
## 2 x o o x o x o o o x o x o o
## 3 o o o x o x o o o o o x o o
## 4 o x x x o x o o o o o x o o
## 5 x x x x x o x o o o o x o o
## 6 x x x o x x o o o o o o o o
## 7 x x x o o x o o o o o o o o
# eacf_info = eacf(arma_data)
# eacf_sig = eacf_info$eacf
# eacf_sig < 0.05
By EACF matrix, ARMA(1,1) is our candidate for modeling.
Attention: Because we early import deterministic trend and seasonal model; to prevent it from stochastic effect, we set zero to parameters d, D, max.P and max.Q.
auto_arima = auto.arima(arma_data_ts,
d = 0, D = 0,
trace = T)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : 3471.228
## ARIMA(0,0,0) with non-zero mean : 3736.151
## ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 3500.003
## ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 3593.304
## ARIMA(0,0,0) with zero mean : 3734.114
## ARIMA(2,0,2)(0,0,1)[12] with non-zero mean : 3482.919
## ARIMA(2,0,2)(1,0,0)[12] with non-zero mean : 3477.057
## ARIMA(2,0,2)(2,0,1)[12] with non-zero mean : 3481.449
## ARIMA(2,0,2)(1,0,2)[12] with non-zero mean : 3473.296
## ARIMA(2,0,2) with non-zero mean : 3506.728
## ARIMA(2,0,2)(0,0,2)[12] with non-zero mean : 3479.29
## ARIMA(2,0,2)(2,0,0)[12] with non-zero mean : 3480.486
## ARIMA(2,0,2)(2,0,2)[12] with non-zero mean : 3483.594
## ARIMA(1,0,2)(1,0,1)[12] with non-zero mean : 3469.97
## ARIMA(1,0,2)(0,0,1)[12] with non-zero mean : 3485.609
## ARIMA(1,0,2)(1,0,0)[12] with non-zero mean : 3476.579
## ARIMA(1,0,2)(2,0,1)[12] with non-zero mean : 3479.809
## ARIMA(1,0,2)(1,0,2)[12] with non-zero mean : 3472.011
## ARIMA(1,0,2) with non-zero mean : 3510.372
## ARIMA(1,0,2)(0,0,2)[12] with non-zero mean : 3480.108
## ARIMA(1,0,2)(2,0,0)[12] with non-zero mean : 3479.762
## ARIMA(1,0,2)(2,0,2)[12] with non-zero mean : 3481.963
## ARIMA(0,0,2)(1,0,1)[12] with non-zero mean : 3545.425
## ARIMA(1,0,1)(1,0,1)[12] with non-zero mean : 3467.862
## ARIMA(1,0,1)(0,0,1)[12] with non-zero mean : 3483.676
## ARIMA(1,0,1)(1,0,0)[12] with non-zero mean : 3474.48
## ARIMA(1,0,1)(2,0,1)[12] with non-zero mean : 3477.662
## ARIMA(1,0,1)(1,0,2)[12] with non-zero mean : 3469.874
## ARIMA(1,0,1) with non-zero mean : 3510.482
## ARIMA(1,0,1)(0,0,2)[12] with non-zero mean : 3477.973
## ARIMA(1,0,1)(2,0,0)[12] with non-zero mean : 3477.657
## ARIMA(1,0,1)(2,0,2)[12] with non-zero mean : 3479.794
## ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : 3594.15
## ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : 3493.262
## ARIMA(2,0,1)(1,0,1)[12] with non-zero mean : 3469.25
## ARIMA(0,0,0)(1,0,1)[12] with non-zero mean : 3666.319
## ARIMA(2,0,0)(1,0,1)[12] with non-zero mean : 3469.661
## ARIMA(1,0,1)(1,0,1)[12] with zero mean : 3466.238
## ARIMA(1,0,1)(0,0,1)[12] with zero mean : 3481.773
## ARIMA(1,0,1)(1,0,0)[12] with zero mean : 3472.697
## ARIMA(1,0,1)(2,0,1)[12] with zero mean : 3475.901
## ARIMA(1,0,1)(1,0,2)[12] with zero mean : 3468.242
## ARIMA(1,0,1) with zero mean : 3508.494
## ARIMA(1,0,1)(0,0,2)[12] with zero mean : 3476.181
## ARIMA(1,0,1)(2,0,0)[12] with zero mean : 3475.781
## ARIMA(1,0,1)(2,0,2)[12] with zero mean : 3478.018
## ARIMA(0,0,1)(1,0,1)[12] with zero mean : 3592.363
## ARIMA(1,0,0)(1,0,1)[12] with zero mean : 3491.253
## ARIMA(2,0,1)(1,0,1)[12] with zero mean : 3467.794
## ARIMA(1,0,2)(1,0,1)[12] with zero mean : 3468.353
## ARIMA(0,0,0)(1,0,1)[12] with zero mean : 3664.864
## ARIMA(0,0,2)(1,0,1)[12] with zero mean : 3543.6
## ARIMA(2,0,0)(1,0,1)[12] with zero mean : 3467.91
## ARIMA(2,0,2)(1,0,1)[12] with zero mean : 3469.932
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(1,0,1)(1,0,1)[12] with zero mean : 3471.867
##
## Best model: ARIMA(1,0,1)(1,0,1)[12] with zero mean
auto_parameters = auto_arima$arma #(p,q,P,Q,s,d,D)
By auto.arima function in “forecast” package, the best model is SARIMA(1,0,1)(1,0,1)[12] with zero mean
So, we have three candidate: a. ARMA(1,1) b. SARMA(1,0,1)(1,0,1)[12]
## ARMA(1,1)
model_arma11 = Arima(arma_data_ts, order = c(1,0,1))
model_arma11_fit = fitted(model_arma11)
res_arma11 = residuals(model_arma11)
std_res_arma11 = rstandard(model_arma11)
## auto arima
auto_p = auto_parameters[1]
auto_q = auto_parameters[2]
auto_P = auto_parameters[3]
auto_Q = auto_parameters[4]
auto_s = auto_parameters[5]
auto_d = auto_parameters[6]
auto_D = auto_parameters[7]
model_arma_auto = Arima(arma_data_ts,
order = c(auto_p,auto_d,auto_q),
seasonal = list(
order = c(auto_P,auto_D,auto_Q),
period = auto_s)
)
model_arma_auto_fit = fitted(model_arma_auto)
res_arma_auto = residuals(model_arma_auto)
std_res_arma_auto = rstandard(model_arma_auto)
model_form = paste0("ARIMA(",
auto_p,",",
auto_d,",",
auto_q,")(",
auto_P,",",
auto_D,",",
auto_Q,")[",
auto_s,"]")
## plot
ts.plot(arma_data_ts,
model_arma11_fit,
model_arma_auto_fit,
col = c(1,2,4),
lwd = c(1,2,2),
lty = c(1,2,3),
main = paste0('ARMA(1,1) and ',model_form,'Model'),
ylim = c(min(arma_data_ts),
max(arma_data_ts)))
# ;points(
# arma_data_ts, col = 'black');legend(
# 'top',
# legend = c('data','arma11',
# paste0(model_form)),
# col = c(1,2,4),lty = 1,cex = 0.5
# )
plot(y=std_res_arma11 ,
x=as.vector(time(std_res_arma11)),
xlab='Time',
ylab='Standardized Residuals',
type='o',
main = 'Satndard Residuals of ARMA(1,1) after de-trend & de-season');abline(a = 3,b = 0,
col = 'red', lty = 2);abline(a = -3,b = 0,
col = 'red', lty = 2);abline(a = 2,b = 0,
col = 'orange', lty = 2);abline(a = -2,b = 0,
col = 'orange', lty = 2);abline(a = 0,b = 0,
col = 'grey', lty = 2)
plot(y=std_res_arma_auto,
x=as.vector(time(std_res_arma_auto)),
ylim = c(-3.5, 3.5),
xlab='Time',
ylab='Standardized Residuals',
type='o',
main = paste0('Satndard Residuals of ARMA(',
auto_p,',',auto_q,
') after de-trend & de-season'));abline(a = 3,b = 0,
col = 'red', lty = 2);abline(a = -3,b = 0,
col = 'red', lty = 2);abline(a = 2,b = 0,
col = 'orange', lty = 2);abline(a = -2,b = 0,
col = 'orange', lty = 2);abline(a = 0,b = 0,
col = 'grey', lty = 2)
acf(as.vector(res_arma11), lag = 60, main = 'ACF : Residuals of ARMA(1,1)')
pacf(as.vector(res_arma11), lag = 60, main = 'PACF : Residuals of ARMA(1,1)')
acf(as.vector(res_arma_auto), lag = 60,
main = paste0('ACF : Residuals of ARMA(',auto_p,',',auto_q,')'))
pacf(as.vector(res_arma_auto), lag = 60,
main = paste0('PACF : Residuals of ARMA(',auto_p,',',auto_q,')'))
Consider that the residuals had better be whote noise, neither ARMA(1,1) nor ARMA(5,2) meets the ideal result because of the peaks with lag 12 on ACF plots, which means it’s definitely not independent white noise.Therefore, lets give stochastic method a try.
H0: non-stationary with lag k H1: stationary with lag k
## stationary
adf.test(data_ts, alternative = c("stationary"),k = 1)
## Warning in adf.test(data_ts, alternative = c("stationary"), k = 1): p-value
## smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -5.156, Lag order = 1, p-value = 0.01
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 2)
## Warning in adf.test(data_ts, alternative = c("stationary"), k = 2): p-value
## smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -4.4843, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 3)
## Warning in adf.test(data_ts, alternative = c("stationary"), k = 3): p-value
## smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -4.0484, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
## non stationary
adf.test(data_ts, alternative = c("stationary"),k = 4)
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -2.3887, Lag order = 4, p-value = 0.4129
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 5)
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -1.5209, Lag order = 5, p-value = 0.777
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 6)
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -1.5549, Lag order = 6, p-value = 0.7627
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 12)
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -0.56348, Lag order = 12, p-value = 0.9782
## alternative hypothesis: stationary
Althogh the above shows outputs of adf.test, I don’t really need it and won’t interpret the test.
acf(data_ts, lag = 60)
pacf(data_ts, lag = 60)
The tail-off acf, nearly equal to 1.0 starting at lag 1, strongly implies the stochastic trend effect. So I try to import non-seasonal part of the arima model, which means lets take difference with lag 1.
## method I : arima model with d = 1
model_stoch_trend = Arima(data_ts, order = c(0,1,0))
model_stoch_trend_fit = fitted(model_stoch_trend)
ts.plot(data_ts,model_stoch_trend_fit ,
col = c(1,2),lwd = c(1,2),lty = c(1,2),
main = 'Stochastic Trend Model',
ylim = c(min(data_ts),max(data_ts)))
res_stoch_trend = residuals(model_stoch_trend)
## method II : diff
diff(data_ts)==residuals(model_stoch_trend)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2000 FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2001 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2002 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2003 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2004 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2005 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2006 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2007 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2008 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2009 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2010 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2011 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2012 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2013 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2014 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2015 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2016 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2017 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## Dec
## 2000 TRUE
## 2001 TRUE
## 2002 TRUE
## 2003 TRUE
## 2004 TRUE
## 2005 TRUE
## 2006 TRUE
## 2007 TRUE
## 2008 TRUE
## 2009 TRUE
## 2010 TRUE
## 2011 TRUE
## 2012 TRUE
## 2013 TRUE
## 2014 TRUE
## 2015 TRUE
## 2016 TRUE
## 2017 TRUE
acf(as.vector(res_stoch_trend), lag = 60)
pacf(as.vector(res_stoch_trend), lag = 60)
Compare determinist linear model to stochastic trend model, I think it’s better to choose the latter one because I noticed that acf after difference shows repeatedly high auto-correlation with lag 12. It may help me more to interpret the further stochastic seasonal model.
stoch_season_data = res_stoch_trend
## method I : arima model with D=1 and period=12
model_stoch_season = Arima(stoch_season_data,
seasonal = list(order = c(0,1,0), period = 12))
model_stoch_season_fit = fitted(model_stoch_season)
ts.plot(stoch_season_data,
model_stoch_season_fit ,
col = c(1,2),lwd = c(1,1),lty = c(1,2),
main = 'Stochastic Season Model',
ylim = c(min(stoch_season_data),
max(stoch_season_data)))
res_stoch_season = residuals(model_stoch_season)
## method 2 : diff with lag 12
diff(stoch_season_data,lag = 12)==residuals(model_stoch_season)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2001 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2002 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2003 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2004 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2005 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2006 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2007 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2008 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2009 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2010 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2011 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2012 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2013 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2014 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2015 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2016 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2017 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## Dec
## 2001 FALSE
## 2002 FALSE
## 2003 TRUE
## 2004 TRUE
## 2005 TRUE
## 2006 TRUE
## 2007 TRUE
## 2008 TRUE
## 2009 TRUE
## 2010 TRUE
## 2011 TRUE
## 2012 TRUE
## 2013 TRUE
## 2014 TRUE
## 2015 TRUE
## 2016 TRUE
## 2017 TRUE
## if you dont import stochastic season effect
# stoch_arma_data = res_stoch_trend
stoch_arma_data = res_stoch_season
plot(y=stoch_arma_data ,
x=as.vector(time(stoch_arma_data)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals after de-trend and de-season')
acf(as.vector(stoch_arma_data), lag = 60,
main = 'ACF of Residuals after de-trend and de season')
pacf(as.vector(stoch_arma_data), lag = 60,
main = 'PACF of Residuals after de-trend and de season')
With low and irregular acf and pacf, it too hard for me to determine ARMA parameters p and q by my poor experience.
It can only partially explain non-seasonal parameters p,q
eacf(stoch_arma_data)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o x o o x x x o x x x o
## 1 x x o x o o o x x o o x o o
## 2 x o x o o o o o o o o x o o
## 3 x x x o o o o o o o o x o o
## 4 x x x o x o o o o o o x o o
## 5 x x o x x o o o o o o x o o
## 6 x o o x o o o o o o o x o o
## 7 x o x x o o o o o o o x o o
ARMA(0,1) may be a proper choice
auto_arima = auto.arima(stoch_arma_data,
d = 0, D = 0,
trace = T)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : 3488.884
## ARIMA(0,0,0) with non-zero mean : 3581.087
## ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 3504.55
## ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 3467.868
## ARIMA(0,0,0) with zero mean : 3579.179
## ARIMA(0,0,1) with non-zero mean : 3522.59
## ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : 3482.222
## ARIMA(0,0,1)(0,0,2)[12] with non-zero mean : 3469.939
## ARIMA(0,0,1)(1,0,0)[12] with non-zero mean : 3500.84
## ARIMA(0,0,1)(1,0,2)[12] with non-zero mean : Inf
## ARIMA(0,0,0)(0,0,1)[12] with non-zero mean : 3510.307
## ARIMA(1,0,1)(0,0,1)[12] with non-zero mean : 3470.948
## ARIMA(0,0,2)(0,0,1)[12] with non-zero mean : 3469.961
## ARIMA(1,0,0)(0,0,1)[12] with non-zero mean : 3473.223
## ARIMA(1,0,2)(0,0,1)[12] with non-zero mean : 3472.868
## ARIMA(0,0,1)(0,0,1)[12] with zero mean : 3466.88
## ARIMA(0,0,1) with zero mean : 3520.959
## ARIMA(0,0,1)(1,0,1)[12] with zero mean : 3481.284
## ARIMA(0,0,1)(0,0,2)[12] with zero mean : 3468.933
## ARIMA(0,0,1)(1,0,0)[12] with zero mean : 3499.465
## ARIMA(0,0,1)(1,0,2)[12] with zero mean : Inf
## ARIMA(0,0,0)(0,0,1)[12] with zero mean : 3508.579
## ARIMA(1,0,1)(0,0,1)[12] with zero mean : 3469.923
## ARIMA(0,0,2)(0,0,1)[12] with zero mean : 3468.938
## ARIMA(1,0,0)(0,0,1)[12] with zero mean : 3471.799
## ARIMA(1,0,2)(0,0,1)[12] with zero mean : 3471.794
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(0,0,1)(0,0,1)[12] with zero mean : 3472.036
##
## Best model: ARIMA(0,0,1)(0,0,1)[12] with zero mean
auto_parameters = auto_arima$arma #(p,q,P,Q,s,d,D)
auto_p = auto_parameters[1]
auto_q = auto_parameters[2]
auto_P = auto_parameters[3]
auto_Q = auto_parameters[4]
auto_s = auto_parameters[5]
auto_d = auto_parameters[6]
auto_D = auto_parameters[7]
According to the auto.arima, the best model is SARIMA(p=0,d=0,q=1)(P=0,D=0,Q=1) with lag 12 and without drift.
model_stoch_arma_auto = Arima(stoch_arma_data,
order = c(auto_p,auto_d,auto_q),
seasonal = list(
order = c(auto_P,auto_D,auto_Q),
period = auto_s)
)
model_stoch_arma_auto_fit = (stoch_arma_data - model_stoch_arma_auto$residuals)
#model_stoch_arma_auto_fit == fitted(model_stoch_arma_auto)
res_stoch_arma_auto = residuals(model_stoch_arma_auto)
std_res_stoch_arma_auto = rstandard(model_stoch_arma_auto)
# res_stoch_arma_auto == model_stoch_arma_auto$residuals
ts.plot(stoch_arma_data,
model_stoch_arma_auto_fit ,
col = c(1,2),lwd = c(1,1),lty = c(1,2),
main = 'SARMA Model',
ylim = c(min(stoch_arma_data),
max(stoch_arma_data)))
# plot(res_stoch_arma_auto) ## just check
plot(y=std_res_stoch_arma_auto,
x=as.vector(time(std_res_stoch_arma_auto)),
xlab='Time',
ylab='Standardized Residuals',
type='o',
main = 'Satndard Residuals of ARMA after de-trend & de-season');abline(a = 3,b = 0,
col = 'red', lty = 2);abline(a = -3,b = 0,
col = 'red', lty = 2);abline(a = 2,b = 0,
col = 'orange', lty = 2);abline(a = -2,b = 0,
col = 'orange', lty = 2);abline(a = 0,b = 0,
col = 'grey', lty = 2)
acf(as.vector(res_stoch_arma_auto),
lag = 60, main = 'ACF : Residuals of SARMA')
pacf(as.vector(res_stoch_arma_auto),
lag = 60, main = 'PACF : Residuals of SARMA')
Before prediction, let’s summarise the model info. Firstly, we apply linear model or difference into the de-trend process; secondly, we apply seasonal model or difference with a lag into the de-season process.After trend and seasonal effects are removed, we established an ARMA or a Seasonal ARMA model to fit the data and expect that the residuals could be white noise.
Therefore, the whole fitting model of Determinist Method would contain three parts : ARMA, Seasonal Model, Trend Model \[ (1-\Sigma_{i}^{p}{\phi_i B_i})Y_t = (1-\Sigma_{j}^{q}{\theta_j B^j})e_t + (\Sigma{\gamma_k m_k}) + (\beta_1 t + \beta_0)\] \[ B : lag \space operator \] \[p,q : non-seasonal \space ARMA \space parameter\]
\[m_i : dummy \space variable \space of \space seasonal\space effect\]
\[\gamma : coefficient \space of \space season\space effect\]
\[\beta : coefficient \space of \space trend\space effect\]
The Whole Stochastic Model would be like: \[ (1-B)^d(1-B^s)^D(1-\Sigma_{i}^{p}{\phi_i B^i})(1-\Sigma_{j}^{P}{\Phi_j B^{js}})Y_t = (1-\Sigma_{k}^{q}{\theta_k B^k})(1-\Sigma_{l}^{Q}{\theta_l B^{ls}})e_t \]
\[ B : lag \space operator \] \[p,d,q : nonseasonal \space ARMA \space parameter\] \[P,D,Q,s : seasonal \space ARMA \space parameter\]
The whole stochastic model go through once difference for de-trend, once defference with lag 12 for de-season,and fitting a seasonal ARMA(0,0,1)(0,0,1)[12] model.Now we can simplify the whole model as a SARIMA(0,1,1)(0,1,1)[12]
model_sarima = Arima(y = data_ts,
order = c(0,1,1),
seasonal = list(order = c(0,1,1),
period = 12))
model_sarima_fit = (data_ts - model_sarima$residuals)
res_sarima = model_sarima$residuals
## I don't know why the residual not the same
res_sarima == res_stoch_arma_auto
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2000 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2001 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2002 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2003 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2004 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2005 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2006 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2007 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2008 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2009 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2010 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2011 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2012 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2013 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2014 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2015 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2016 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2017 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## Dec
## 2000 FALSE
## 2001 FALSE
## 2002 FALSE
## 2003 FALSE
## 2004 FALSE
## 2005 FALSE
## 2006 FALSE
## 2007 FALSE
## 2008 FALSE
## 2009 FALSE
## 2010 FALSE
## 2011 FALSE
## 2012 FALSE
## 2013 FALSE
## 2014 FALSE
## 2015 FALSE
## 2016 FALSE
## 2017 FALSE
res_sarima - res_stoch_arma_auto
## Jan Feb Mar Apr May
## 2000 -7.388421 -13.379831 -16.754230 -18.460976 -19.258907
## 2001 91.913477 32.066886 -63.148777 -8.237246 19.998097
## 2002 -31.211981 -50.492115 -104.884810 -36.638362 -6.046498
## 2003 -112.714044 -81.221772 -100.937725 -62.623669 -33.354676
## 2004 -99.269968 -52.008779 -85.536683 -52.234578 -33.824281
## 2005 -71.580000 -54.517753 -57.747440 -51.833419 -48.655191
## 2006 -64.095510 -45.304412 -54.931416 -46.106965 -43.176203
## 2007 -56.167327 -48.178956 -56.390167 -48.721637 -55.157659
## 2008 -53.152595 -55.272543 -56.456240 -54.503590 -54.563919
## 2009 -68.732052 -54.532725 -58.345299 -53.764879 -56.640961
## 2010 -57.138055 -65.181011 -59.594780 -68.418156 -62.871350
## 2011 -59.901603 -54.351825 -48.122887 -51.521428 -53.675621
## 2012 -60.158533 -50.373032 -64.422283 -62.102258 -52.441293
## 2013 -60.494723 -64.290823 -51.392051 -55.501599 -47.211391
## 2014 -65.901702 -62.926529 -46.761600 -39.905645 -57.056021
## 2015 -66.751615 -38.811912 -36.728254 -43.943805 -48.706085
## 2016 -60.325330 -50.588472 -57.563262 -55.169705 -47.273278
## 2017 -60.409694 -60.086985 -56.711489 -51.672911 -42.193441
## Jun Jul Aug Sep Oct
## 2000 -19.505662 -19.647836 -19.709423 -19.740683 -19.760895
## 2001 32.782089 -4.493858 -28.346272 20.902271 -9.232743
## 2002 12.239330 -42.110521 -54.217819 1.108370 -42.254380
## 2003 -17.047984 -59.135007 -50.539326 -20.192136 -54.261378
## 2004 -33.215939 -54.841724 -42.199865 -31.964886 -57.560147
## 2005 -47.169646 -56.925528 -50.678588 -48.028407 -60.107645
## 2006 -47.436859 -54.063278 -51.318725 -48.673067 -52.119936
## 2007 -53.906868 -56.250546 -57.516933 -45.543389 -55.050814
## 2008 -51.641968 -54.658515 -49.053958 -57.857195 -56.648718
## 2009 -54.194310 -56.274781 -58.521807 -52.684202 -59.216833
## 2010 -58.790996 -57.981531 -61.734323 -67.188984 -59.295454
## 2011 -52.867176 -63.560171 -58.469123 -55.470063 -52.500305
## 2012 -47.421900 -39.877735 -53.105350 -45.998019 -56.422362
## 2013 -58.496624 -68.535778 -59.064140 -65.219589 -57.495593
## 2014 -56.323280 -60.956352 -61.309081 -56.613197 -62.698331
## 2015 -48.815440 -64.212105 -44.022899 -44.226835 -53.144604
## 2016 -58.289453 -50.292390 -50.285565 -45.266509 -46.811745
## 2017 -55.416862 -49.550631 -63.597979 -65.045352 -51.241549
## Nov Dec
## 2000 -19.588185 -17.682287
## 2001 -44.370116 15.359414
## 2002 -77.324266 9.635419
## 2003 -67.305151 -17.183525
## 2004 -60.187812 -45.495841
## 2005 -58.136819 -52.109853
## 2006 -57.327953 -50.271398
## 2007 -54.196324 -49.220898
## 2008 -57.828290 -59.019607
## 2009 -57.396712 -58.702715
## 2010 -53.531467 -56.656636
## 2011 -54.211815 -57.555366
## 2012 -72.266387 -62.710972
## 2013 -61.196283 -66.823191
## 2014 -60.070073 -56.267072
## 2015 -57.257195 -61.938140
## 2016 -46.098064 -57.416653
## 2017 -37.091123 -49.235542
## But the acf and pacf pattern are nearly equal
par(mfrow = c(2,2));acf(as.vector(res_sarima),
lag = 60, main = 'ACF : Residuals of SARIMA');pacf(as.vector(res_sarima),
lag = 60, main = 'PACF : Residuals of SARIMA');acf(as.vector(res_stoch_arma_auto),
lag = 60, main = 'ACF : Residuals of whole split models previously');pacf(as.vector(res_stoch_arma_auto),
lag = 60, main = 'PACF : Residuals of whole split models previously')
dummy = seasonaldummy(ts(data_ts,f=12))
var_time = time(data_ts)
xreg = model.matrix(~dummy + var_time)
model_det_arima = Arima(y = data_ts,
xreg = xreg ,
include.mean=FALSE ,
order = c(1,0,1),
seasonal=list(order=c(1,0,1), period=12)
)
model_det_arima_fit = (data_ts - model_det_arima $residuals)
res_det_arima = model_det_arima$residuals
Determinist de-trend + Stochastic de-season + SARIMA
xreg = time(data_ts)
model_mix_arima = Arima(y = data_ts,
xreg = xreg,
order = c(1,0,1),
seasonal=list(order=c(1,1,1), period=12)
)
model_mix_arima_fit = (data_ts - model_mix_arima$residuals)
res_mix_arima = model_mix_arima$residuals
Stochastic de-trend + Determinist de-season + SARIMA I met some Non-Stationary Problems
# dummy = seasonaldummy(ts(data_ts,f=12))
# xreg = model.matrix(~dummy)
#
# model_mix_arima = Arima(y =data_ts,
# xreg = xreg,
# order = c(1,0,1),
# include.mean=FALSE ,
# seasonal=list(order=c(1,0,1),
# period=12)
# )
#
# model_mix_arima_fit = (data_ts - model_mix_arima$residuals)
# res_mix_arima = model_mix_arima$residuals
pred_sarima = forecast(model_sarima, h = forecast_num)
plot(pred_sarima);lines(pred_sarima$fitted,col="red");lines(target_data_ts,col="black")
test_dummy = seasonaldummy(ts(test_ts,f=12))
test_var_time = time(test_ts)
test_xreg = model.matrix(~test_dummy + test_var_time)
pred_det_arima = forecast(model_det_arima,
h = forecast_num,
xreg = test_xreg)
## Warning in forecast.Arima(model_det_arima, h = forecast_num, xreg =
## test_xreg): xreg contains different column names from the xreg used in
## training. Please check that the regressors are in the same order.
plot(pred_det_arima);lines(pred_det_arima$fitted,col="red");lines(target_data_ts,col="black")
test_xreg = time(test_ts)
pred_mix_arima = forecast(model_mix_arima,
h = forecast_num,
xreg = test_xreg)
plot(pred_mix_arima);lines(pred_mix_arima$fitted,col="red");lines(target_data_ts,col="black")
Thanks for reading !